This script creates a correlation matrix between all variables of our data set, using Cramers V and the the sample we will also use for model specification. Further, we will create groups for highly correlated Variables and select one to represent this group.
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## .
## FALSE
## 2260
sample <- sample %>% as.data.frame()
### filter with factors over 15
idx <- sapply(sample,FUN = function(x) length(unique(x)) > 15)
idx %>% table()## .
## FALSE TRUE
## 128 36
sample2 <- sample[, !idx]
### filter with na percentage higher then .5
idx <- sapply(sample2,FUN = function(x) mean(is.na(x)) > .5)
idx %>% table()## .
## FALSE TRUE
## 95 33
sample2 <- sample2[, !idx]
### other non informative variables filter
sample2 <- sample2[,!colnames(sample2) %in% c("idep","imun")]
sample_fast <- data.frame(lapply(sample2, function(x) {
if (is.factor(x)) as.numeric(x) else x
}))
sample2## Loading required package: grid
tmp_data <- matrix(ncol = 3,nrow = 0) %>% as.data.frame()
colnames(tmp_data) <- c("v1","v2","cramresV")
for(i in seq_along(sample2)){
for(j in seq_along(sample2)){
x1 <- sample2[,i] %>% as.factor()
x2 <- sample2[,j] %>% as.factor()
tbl <- table(x1, x2)
tmp_vec <- c(colnames(sample2)[i],colnames(sample2)[j],assocstats(tbl)$cramer[[1]]) %>% unlist()
tmp_data <- rbind(tmp_data,tmp_vec)
}
}
colnames(tmp_data) <- c("v1","v2","cramresV")
This is a histogramm showing the distribution of the correlation between
all variables.
cram_mat <- tmp_data %>%
filter(v1 != v2) %>% # remove self-pairs
pivot_wider(
names_from = v2,
values_from = cramresV
) %>%
column_to_rownames("v1") %>%
as.matrix()
dim(cram_mat)## [1] 93 93
cram_long <- melt(cram_mat)
corr_matrix_2 <- ggplot(cram_long, aes(Var1, Var2, fill = value)) +
geom_tile() +
scale_fill_viridis_c() +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, size = 6),
axis.text.y = element_text(size = 6)
) +
labs(fill = "Cramér's V")
corr_matrix_2ggsave("../output/corr_matrix-2.png",plot = corr_matrix_2,width = 10,height = 6, device = "png",dpi = 300)dat_selection <- tmp_data
### drop the pair with itself
dat_selection <- dat_selection[dat_selection$v1 != dat_selection$v2,]
dat_selection <- dat_selection[!is.na(dat_selection$cramresV),]
dat_selection$cramresV %>% hist()### load in list from. A
list_variables <- readRDS("../data_raw/misc/list_variables_seleciton.RDS")
### filter for only these variables
dat_selection <- dat_selection[dat_selection$v1 %in% list_variables,]
dat_selection$pair_sorted <- apply(dat_selection[, c("v1","v2")], 1, function(x) paste(sort(x), collapse = "_"))
dat_selection_unique <- dat_selection[!duplicated(dat_selection$pair_sorted), ]
dat_selection_unique[,1:3] %>% reactable(.,compact = T,filterable = T)# dat_selection_unique: v1, v2, cramresV
threshold <- 0.75
# keep only strong pairs
strong_pairs <- dat_selection_unique %>% filter(cramresV > threshold)
# build graph
g <- graph_from_data_frame(strong_pairs[, c("v1", "v2")], directed = FALSE)
# find connected components
comp <- components(g)
# create a dataframe with groups
group_df <- data.frame(
var = names(comp$membership),
group = comp$membership
)
# optional: collapse variables in each group into one row
grouped_vars <- group_df %>%
group_by(group) %>%
summarise(vars = paste(var, collapse = ", "), .groups = "drop")
reactable(grouped_vars)